1.1 Importing Data

# Download the UCI breast-cancer data set (comma-separated, no header row).
# "?" marks a missing value in the raw file, so it is mapped to NA on import;
# the argument is spelled out as na.strings instead of relying on partial
# matching of `na`.
# stringsAsFactors = TRUE makes the text columns factors regardless of the
# R version's default, matching the factor levels shown in the recorded output.
breast_cancer_url <- paste0(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/",
  "breast-cancer/breast-cancer.data"
)
breast_cancer <- as_tibble(read.table(
  breast_cancer_url,
  header = FALSE,
  sep = ",",
  na.strings = "?",
  stringsAsFactors = TRUE,
  col.names = c(
    "class", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"
  )
))

1.2 Finding the age category which has the highest risk of recurrence

Viewing distinct factors for age

# List the distinct age groups that occur in the data.
unique(breast_cancer[["age"]])
## [1] 30-39 40-49 60-69 50-59 70-79 20-29
## Levels: 20-29 30-39 40-49 50-59 60-69 70-79

Building contingency table for age and recurrence

# Cross-tabulate age group (rows) against recurrence class (columns);
# the argument names become the dimension labels of the table.
breast_cancer_table <- table(
  Age = breast_cancer[["age"]],
  Recurrence = breast_cancer[["class"]]
)
breast_cancer_table
##        Recurrence
## Age     no-recurrence-events recurrence-events
##   20-29                    1                 0
##   30-39                   21                15
##   40-49                   63                27
##   50-59                   71                25
##   60-69                   40                17
##   70-79                    5                 1

Calculating probability of recurrence by age group

# Fraction of the patients in `age_group` whose class is "recurrence-events".
# Returns a plain number (NaN for an empty group) rather than the 1x1 data
# frame that dividing two count() results produces, so the probabilities can
# be combined into an ordinary numeric vector.
recurrence_rate_by_age <- function(age_group) {
  in_group <- breast_cancer %>% filter(age == age_group)
  recurred <- in_group$class == "recurrence-events"
  # Rows with a missing class stay in the denominator but never count as a
  # recurrence, which is what the filter()-based version did.
  sum(recurred, na.rm = TRUE) / nrow(in_group)
}

probAge1 <- recurrence_rate_by_age("20-29")
probAge2 <- recurrence_rate_by_age("30-39")
probAge3 <- recurrence_rate_by_age("40-49")
probAge4 <- recurrence_rate_by_age("50-59")
probAge5 <- recurrence_rate_by_age("60-69")
probAge6 <- recurrence_rate_by_age("70-79")

# Age-group labels, listed in the same order as probAge1..probAge6.
ages <- c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79")

# Combine the six per-group recurrence probabilities and label each entry
# with its age group.
recur_probs <- setNames(
  c(probAge1, probAge2, probAge3, probAge4, probAge5, probAge6),
  ages
)

Determining age category with the highest risk of recurrence

# Report the label of the age group whose recurrence probability is largest.
riskiest_age_index <- which.max(recur_probs)
names(recur_probs)[riskiest_age_index]
## [1] "30-39"

1.3 Finding breast with the higher risk of recurrence

Viewing distinct factors for breast

# List the distinct values of the breast column.
unique(breast_cancer[["breast"]])
## [1] left  right
## Levels: left right

Building contingency table for breast and recurrence

# Cross-tabulate breast side (rows) against recurrence class (columns);
# this overwrites the age-based table stored under the same name.
breast_cancer_table <- table(
  Breast = breast_cancer[["breast"]],
  Recurrence = breast_cancer[["class"]]
)
breast_cancer_table
##        Recurrence
## Breast  no-recurrence-events recurrence-events
##   left                   103                49
##   right                   98                36

Calculating probability of recurrence by breast

# Fraction of the patients with the given breast side whose class is
# "recurrence-events". Returns a plain number (NaN for an empty group) rather
# than the 1x1 data frame that dividing two count() results produces.
recurrence_rate_by_breast <- function(side) {
  in_group <- breast_cancer %>% filter(breast == side)
  recurred <- in_group$class == "recurrence-events"
  # Rows with a missing class stay in the denominator but never count as a
  # recurrence, which is what the filter()-based version did.
  sum(recurred, na.rm = TRUE) / nrow(in_group)
}

probLeft <- recurrence_rate_by_breast("left")
probRight <- recurrence_rate_by_breast("right")

# Breast-side labels, listed in the same order as probLeft, probRight.
breasts <- c("left", "right")

# Combine the two recurrence probabilities and label each entry with its
# breast side.
recur_probs_breasts <- setNames(c(probLeft, probRight), breasts)

Determining breast with the higher risk of recurrence

# Report the breast side whose recurrence probability is larger.
riskiest_breast_index <- which.max(recur_probs_breasts)
names(recur_probs_breasts)[riskiest_breast_index]
## [1] "left"

1.4 Drop missing values

# Keep only the rows that have no missing value in any column.
breast_cancer <- drop_na(breast_cancer)

1.5 Rank attributes and plot correlation matrix

Building correlation matrix

# Heterogeneous correlation matrix over all ten columns, computed by hetcor().
# All vectors in breast_cancer dataset are numeric or factors, so no mutations are necessary.
# NOTE(review): the columns are passed one by one as `breast_cancer$<col>`
# expressions; presumably hetcor() derives its variable labels from these
# expressions, so confirm before changing the shape of the call.
breast_cancer.cor <- hetcor(breast_cancer$class, breast_cancer$age, breast_cancer$menopause, breast_cancer$tumor_size, breast_cancer$inv_nodes, breast_cancer$node_caps, breast_cancer$deg_malig, breast_cancer$breast, breast_cancer$breast_quad, breast_cancer$irradiat)
## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

## Warning in log(P): NaNs produced

Storing correlation matrix with updated column names

# Turn the correlations component of the hetcor() result into a tibble and
# give its columns the column names of the data set.
breast_cancer.corrmat <- breast_cancer.cor$correlations %>%
  as_tibble() %>%
  setNames(colnames(breast_cancer))

# Show the relabelled correlation matrix.
breast_cancer.corrmat
## # A tibble: 10 x 10
##      class     age menopause tumor_size inv_nodes node_caps deg_malig
##      <dbl>   <dbl>     <dbl>      <dbl>     <dbl>     <dbl>     <dbl>
##  1  1      -0.123    0.0995     0.222      0.505     0.481     0.425 
##  2 -0.123   1       -0.884     -0.0214    -0.143    -0.0703   -0.0711
##  3  0.0995 -0.884    1          0.00476    0.0445    0.0455   -0.0588
##  4  0.222  -0.0214   0.00476    1          0.268     0.274     0.177 
##  5  0.505  -0.143    0.0445     0.268      1         0.852     0.444 
##  6  0.481  -0.0703   0.0455     0.274      0.852     1         0.495 
##  7  0.425  -0.0711  -0.0588     0.177      0.444     0.495     1     
##  8 -0.0686 -0.0537   0.0872     0.0316     0.0462   -0.0221   -0.0577
##  9  0.0794 -0.0305   0.0955     0.185      0.0728    0.0936    0.0523
## 10  0.381  -0.109    0.126      0.244      0.530     0.548     0.332 
## # ... with 3 more variables: breast <dbl>, breast_quad <dbl>,
## #   irradiat <dbl>

Adding attribute names in a new column and reordering columns

# Row i of the correlation matrix belongs to the i-th column name, so the
# column names double as row labels. Store them in a leading `attribute`
# column and keep the remaining columns in their existing order.
attribute_names <- colnames(breast_cancer.corrmat)
breast_cancer.corrmat <- breast_cancer.corrmat %>%
  mutate(attribute = attribute_names) %>%
  select(attribute, everything())

Printing the ranking

# Rank the attributes by the square of their correlation with `class`,
# strongest first. Only the attribute name, the correlation and its square
# are kept.
rank <- breast_cancer.corrmat %>%
  transmute(attribute, class, squared.correlation = class^2) %>%
  arrange(desc(squared.correlation))

rank
## # A tibble: 10 x 3
##    attribute     class squared.correlation
##    <chr>         <dbl>               <dbl>
##  1 class        1                  1      
##  2 inv_nodes    0.505              0.255  
##  3 node_caps    0.481              0.232  
##  4 deg_malig    0.425              0.180  
##  5 irradiat     0.381              0.146  
##  6 tumor_size   0.222              0.0495 
##  7 age         -0.123              0.0150 
##  8 menopause    0.0995             0.00991
##  9 breast_quad  0.0794             0.00631
## 10 breast      -0.0686             0.00471

Visualizing the correlation matrix in a correlogram

# Correlogram of the hetcor() correlations: shaded cells, upper triangle only,
# black text labels rotated by 45 degrees.
corrplot(
  breast_cancer.cor$correlations,
  method = "shade",
  type = "upper",
  tl.col = "black",
  tl.srt = 45
)

1.6 Decision tree models

1.6.1 Calculating information gain for each regular attribute

Computing entropy of dataset for gain calculations

# Number of rows in each recurrence class, in class-level order.
counts <- count(breast_cancer, class)[["n"]]

# Entropy and gini of the whole data set.
# `counts` is a vector of class frequencies, so it is passed with type = 2,
# the same way every other call in this file passes a row of counts.
# Without type = 2 the call reported an entropy of exactly 1, which cannot be
# right for classes of unequal size and inflated info.before (and therefore
# every information gain) by a constant.
# NOTE(review): the recorded output shown after this call, and the recorded
# gains further down, predate this fix; rerun to refresh them.
entropy_set <- entropy.gini(counts, type = 2)

# The entropy for the entire dataset is:
entropy_set$entropy
## [1] 1

Calculating gain based on age

Looking at the class distribution across the six age subsets

# Entropy before any split: the whole-data-set entropy computed earlier.
info.before <- entropy_set[["entropy"]]

# Class counts within each age group (rows: age group, columns: class).
subsets.age <- table(breast_cancer[["age"]], breast_cancer[["class"]])
subsets.age
##        
##         no-recurrence-events recurrence-events
##   20-29                    1                 0
##   30-39                   21                15
##   40-49                   62                27
##   50-59                   69                22
##   60-69                   38                17
##   70-79                    5                 0

Computing entropy of age subsets

# Entropy of the class counts inside each age group; each row of the
# contingency table is handed to entropy.gini() with type = 2.
entropy20_29 <- entropy.gini(subsets.age["20-29", ], type = 2)[["entropy"]]
entropy30_39 <- entropy.gini(subsets.age["30-39", ], type = 2)[["entropy"]]
entropy40_49 <- entropy.gini(subsets.age["40-49", ], type = 2)[["entropy"]]
entropy50_59 <- entropy.gini(subsets.age["50-59", ], type = 2)[["entropy"]]
entropy60_69 <- entropy.gini(subsets.age["60-69", ], type = 2)[["entropy"]]
entropy70_79 <- entropy.gini(subsets.age["70-79", ], type = 2)[["entropy"]]

Computing weighted entropy for age

# Number of rows in the data set; the denominator of every subset weight.
total <- nrow(breast_cancer)

# Weight of an age group = its row count divided by the total row count.
age_group_sizes <- rowSums(subsets.age)
w20_29 <- age_group_sizes[["20-29"]] / total
w30_39 <- age_group_sizes[["30-39"]] / total
w40_49 <- age_group_sizes[["40-49"]] / total
w50_59 <- age_group_sizes[["50-59"]] / total
w60_69 <- age_group_sizes[["60-69"]] / total
w70_79 <- age_group_sizes[["70-79"]] / total

# Weighted sum of the per-group entropies.
info.age <-
  w20_29 * entropy20_29 +
  w30_39 * entropy30_39 +
  w40_49 * entropy40_49 +
  w50_59 * entropy50_59 +
  w60_69 * entropy60_69 +
  w70_79 * entropy70_79

Computing information gain for age

# Information gain = entropy before the split minus the weighted entropy
# after splitting on age.
gain.age <- info.before - info.age
print(gain.age)
## [1] 0.1489036

Calculating gain based on menopause

Looking at distribution in terms of three subsets by menopause

# Class counts within each menopause status (rows: status, columns: class).
subsets.menopause <- table(
  breast_cancer[["menopause"]],
  breast_cancer[["class"]]
)
subsets.menopause
##          
##           no-recurrence-events recurrence-events
##   ge40                      90                33
##   lt40                       5                 0
##   premeno                  101                48

Computing entropy of menopause subsets

# Entropy of the class counts inside each menopause status (type = 2 call on
# one table row at a time).
entropy_premeno <-
  entropy.gini(subsets.menopause["premeno", ], type = 2)[["entropy"]]
entropy_ge40 <-
  entropy.gini(subsets.menopause["ge40", ], type = 2)[["entropy"]]
entropy_lt40 <-
  entropy.gini(subsets.menopause["lt40", ], type = 2)[["entropy"]]

Computing weighted entropy for menopause

# Weight of a menopause status = its row count divided by the total row count.
menopause_sizes <- rowSums(subsets.menopause)
w_premeno <- menopause_sizes[["premeno"]] / total
w_ge40 <- menopause_sizes[["ge40"]] / total
w_lt40 <- menopause_sizes[["lt40"]] / total

# Weighted sum of the per-status entropies.
info.menopause <-
  w_premeno * entropy_premeno +
  w_ge40 * entropy_ge40 +
  w_lt40 * entropy_lt40

Computing information gain for menopause

# Information gain from splitting on menopause status.
gain.menopause <- info.before - info.menopause
print(gain.menopause)
## [1] 0.1397255

Calculating gain based on tumor size

Looking at the class distribution across the eleven tumor size subsets

# Class counts within each tumor-size band (rows: band, columns: class).
subsets.tumor_size <- table(
  breast_cancer[["tumor_size"]],
  breast_cancer[["class"]]
)
subsets.tumor_size
##        
##         no-recurrence-events recurrence-events
##   0-4                      7                 1
##   10-14                   27                 1
##   15-19                   23                 6
##   20-24                   34                14
##   25-29                   33                18
##   30-34                   33                24
##   35-39                   12                 7
##   40-44                   16                 6
##   45-49                    2                 1
##   5-9                      4                 0
##   50-54                    5                 3

Computing entropy of tumor size subsets

# Entropy of the class counts inside each tumor-size band, numbered ts01..ts11
# in ascending size order (type = 2 call on one table row at a time).
entropy_ts01 <- entropy.gini(subsets.tumor_size["0-4", ], type = 2)[["entropy"]]
entropy_ts02 <- entropy.gini(subsets.tumor_size["5-9", ], type = 2)[["entropy"]]
entropy_ts03 <- entropy.gini(subsets.tumor_size["10-14", ], type = 2)[["entropy"]]
entropy_ts04 <- entropy.gini(subsets.tumor_size["15-19", ], type = 2)[["entropy"]]
entropy_ts05 <- entropy.gini(subsets.tumor_size["20-24", ], type = 2)[["entropy"]]
entropy_ts06 <- entropy.gini(subsets.tumor_size["25-29", ], type = 2)[["entropy"]]
entropy_ts07 <- entropy.gini(subsets.tumor_size["30-34", ], type = 2)[["entropy"]]
entropy_ts08 <- entropy.gini(subsets.tumor_size["35-39", ], type = 2)[["entropy"]]
entropy_ts09 <- entropy.gini(subsets.tumor_size["40-44", ], type = 2)[["entropy"]]
entropy_ts10 <- entropy.gini(subsets.tumor_size["45-49", ], type = 2)[["entropy"]]
entropy_ts11 <- entropy.gini(subsets.tumor_size["50-54", ], type = 2)[["entropy"]]

Computing weighted entropy for tumor_size

# Weight of a tumor-size band = its row count divided by the total row count.
tumor_size_sizes <- rowSums(subsets.tumor_size)
w_ts01 <- tumor_size_sizes[["0-4"]] / total
w_ts02 <- tumor_size_sizes[["5-9"]] / total
w_ts03 <- tumor_size_sizes[["10-14"]] / total
w_ts04 <- tumor_size_sizes[["15-19"]] / total
w_ts05 <- tumor_size_sizes[["20-24"]] / total
w_ts06 <- tumor_size_sizes[["25-29"]] / total
w_ts07 <- tumor_size_sizes[["30-34"]] / total
w_ts08 <- tumor_size_sizes[["35-39"]] / total
w_ts09 <- tumor_size_sizes[["40-44"]] / total
w_ts10 <- tumor_size_sizes[["45-49"]] / total
w_ts11 <- tumor_size_sizes[["50-54"]] / total

# Weighted sum of the per-band entropies.
info.tumor_size <-
  w_ts01 * entropy_ts01 +
  w_ts02 * entropy_ts02 +
  w_ts03 * entropy_ts03 +
  w_ts04 * entropy_ts04 +
  w_ts05 * entropy_ts05 +
  w_ts06 * entropy_ts06 +
  w_ts07 * entropy_ts07 +
  w_ts08 * entropy_ts08 +
  w_ts09 * entropy_ts09 +
  w_ts10 * entropy_ts10 +
  w_ts11 * entropy_ts11

Computing information gain for tumor_size

# Information gain from splitting on tumor size.
gain.tumor_size <- info.before - info.tumor_size
print(gain.tumor_size)
## [1] 0.1896315

Calculating gain based on inv_nodes

Looking at the class distribution across the seven inv_nodes subsets

# Class counts within each inv_nodes band (rows: band, columns: class).
subsets.inv_nodes <- table(
  breast_cancer[["inv_nodes"]],
  breast_cancer[["class"]]
)
subsets.inv_nodes
##        
##         no-recurrence-events recurrence-events
##   0-2                    166                43
##   12-14                    1                 2
##   15-17                    3                 3
##   24-26                    0                 1
##   3-5                     17                17
##   6-8                      7                10
##   9-11                     2                 5

Computing entropy of inv_nodes subsets

# Entropy of the class counts inside each inv_nodes band, numbered in01..in07
# in ascending order (type = 2 call on one table row at a time).
entropy_in01 <- entropy.gini(subsets.inv_nodes["0-2", ], type = 2)[["entropy"]]
entropy_in02 <- entropy.gini(subsets.inv_nodes["3-5", ], type = 2)[["entropy"]]
entropy_in03 <- entropy.gini(subsets.inv_nodes["6-8", ], type = 2)[["entropy"]]
entropy_in04 <- entropy.gini(subsets.inv_nodes["9-11", ], type = 2)[["entropy"]]
entropy_in05 <- entropy.gini(subsets.inv_nodes["12-14", ], type = 2)[["entropy"]]
entropy_in06 <- entropy.gini(subsets.inv_nodes["15-17", ], type = 2)[["entropy"]]
entropy_in07 <- entropy.gini(subsets.inv_nodes["24-26", ], type = 2)[["entropy"]]

Computing weighted entropy for inv_nodes

# Weight of an inv_nodes band = its row count divided by the total row count.
inv_nodes_sizes <- rowSums(subsets.inv_nodes)
w_in01 <- inv_nodes_sizes[["0-2"]] / total
w_in02 <- inv_nodes_sizes[["3-5"]] / total
w_in03 <- inv_nodes_sizes[["6-8"]] / total
w_in04 <- inv_nodes_sizes[["9-11"]] / total
w_in05 <- inv_nodes_sizes[["12-14"]] / total
w_in06 <- inv_nodes_sizes[["15-17"]] / total
w_in07 <- inv_nodes_sizes[["24-26"]] / total

# Weighted sum of the per-band entropies.
info.inv_nodes <-
  w_in01 * entropy_in01 +
  w_in02 * entropy_in02 +
  w_in03 * entropy_in03 +
  w_in04 * entropy_in04 +
  w_in05 * entropy_in05 +
  w_in06 * entropy_in06 +
  w_in07 * entropy_in07

Computing information gain for inv_nodes

# Information gain from splitting on inv_nodes.
gain.inv_nodes <- info.before - info.inv_nodes
print(gain.inv_nodes)
## [1] 0.2105958

Calculating gain based on node_caps

Looking at the class distribution across the two node_caps subsets

# Class counts for each node_caps value (rows: no/yes, columns: class).
subsets.node_caps <- table(
  breast_cancer[["node_caps"]],
  breast_cancer[["class"]]
)
subsets.node_caps
##      
##       no-recurrence-events recurrence-events
##   no                   171                50
##   yes                   25                31

Computing entropy of node_caps subsets

# Entropy of the class counts for node_caps = no and node_caps = yes
# (type = 2 call on one table row at a time).
entropy_ncno <- entropy.gini(subsets.node_caps["no", ], type = 2)[["entropy"]]
entropy_ncyes <- entropy.gini(subsets.node_caps["yes", ], type = 2)[["entropy"]]

Computing weighted entropy for node_caps

# Weight of a node_caps value = its row count divided by the total row count.
node_caps_sizes <- rowSums(subsets.node_caps)
w_ncno <- node_caps_sizes[["no"]] / total
w_ncyes <- node_caps_sizes[["yes"]] / total

# Weighted sum of the two entropies.
info.node_caps <-
  w_ncno * entropy_ncno +
  w_ncyes * entropy_ncyes

Computing information gain for node_caps

# Information gain from splitting on node_caps.
gain.node_caps <- info.before - info.node_caps
print(gain.node_caps)
## [1] 0.1840568

Calculating gain based on deg_malig

Looking at distribution in terms of three subsets by deg_malig

# Class counts for each degree of malignancy (rows: degree, columns: class).
subsets.deg_malig <- table(
  breast_cancer[["deg_malig"]],
  breast_cancer[["class"]]
)
subsets.deg_malig
##    
##     no-recurrence-events recurrence-events
##   1                   57                 9
##   2                  101                28
##   3                   38                44

Computing entropy of deg_malig subsets

# Entropy of the class counts for each degree of malignancy. Rows are looked
# up by their character labels "1", "2", "3" (type = 2 call per row).
entropy_dm1 <- entropy.gini(subsets.deg_malig["1", ], type = 2)[["entropy"]]
entropy_dm2 <- entropy.gini(subsets.deg_malig["2", ], type = 2)[["entropy"]]
entropy_dm3 <- entropy.gini(subsets.deg_malig["3", ], type = 2)[["entropy"]]

Computing weighted entropy for deg_malig

# Weight of a malignancy degree = its row count divided by the total row count.
deg_malig_sizes <- rowSums(subsets.deg_malig)
w_dm1 <- deg_malig_sizes[["1"]] / total
w_dm2 <- deg_malig_sizes[["2"]] / total
w_dm3 <- deg_malig_sizes[["3"]] / total

# Weighted sum of the per-degree entropies.
info.deg_malig <-
  w_dm1 * entropy_dm1 +
  w_dm2 * entropy_dm2 +
  w_dm3 * entropy_dm3

Computing information gain for deg_malig

# Information gain from splitting on degree of malignancy.
gain.deg_malig <- info.before - info.deg_malig
print(gain.deg_malig)
## [1] 0.2167076

Calculating gain based on breast

Looking at the class distribution across the two breast subsets

# Class counts for each breast side (rows: left/right, columns: class).
subsets.breast <- table(
  breast_cancer[["breast"]],
  breast_cancer[["class"]]
)
subsets.breast
##        
##         no-recurrence-events recurrence-events
##   left                   100                45
##   right                   96                36

Computing entropy of breast subsets

# Entropy of the class counts for the left and the right breast
# (type = 2 call on one table row at a time).
entropy_left <- entropy.gini(subsets.breast["left", ], type = 2)[["entropy"]]
entropy_right <- entropy.gini(subsets.breast["right", ], type = 2)[["entropy"]]

Computing weighted entropy for breast

# Weight of a breast side = its row count divided by the total row count.
breast_sizes <- rowSums(subsets.breast)
w_left <- breast_sizes[["left"]] / total
w_right <- breast_sizes[["right"]] / total

# Weighted sum of the two entropies.
info.breast <-
  w_left * entropy_left +
  w_right * entropy_right

Computing information gain for breast

# Information gain from splitting on breast side.
gain.breast <- info.before - info.breast
print(gain.breast)
## [1] 0.1294075

Calculating gain based on breast quadrant

Looking at the class distribution across the five breast_quad subsets

# Class counts for each breast quadrant (rows: quadrant, columns: class).
subsets.breast_quad <- table(
  breast_cancer[["breast_quad"]],
  breast_cancer[["class"]]
)
subsets.breast_quad
##            
##             no-recurrence-events recurrence-events
##   central                     17                 4
##   left_low                    73                33
##   left_up                     69                25
##   right_low                   17                 6
##   right_up                    20                13

Computing entropy of breast quadrant subsets

# Entropy of the class counts inside each breast quadrant, numbered bq01..bq05
# in table-row order (type = 2 call on one table row at a time).
entropy_bq01 <- entropy.gini(subsets.breast_quad["central", ], type = 2)[["entropy"]]
entropy_bq02 <- entropy.gini(subsets.breast_quad["left_low", ], type = 2)[["entropy"]]
entropy_bq03 <- entropy.gini(subsets.breast_quad["left_up", ], type = 2)[["entropy"]]
entropy_bq04 <- entropy.gini(subsets.breast_quad["right_low", ], type = 2)[["entropy"]]
entropy_bq05 <- entropy.gini(subsets.breast_quad["right_up", ], type = 2)[["entropy"]]

Computing weighted entropy for breast quadrant

# Weight of a quadrant = its row count divided by the total row count.
breast_quad_sizes <- rowSums(subsets.breast_quad)
w_bq01 <- breast_quad_sizes[["central"]] / total
w_bq02 <- breast_quad_sizes[["left_low"]] / total
w_bq03 <- breast_quad_sizes[["left_up"]] / total
w_bq04 <- breast_quad_sizes[["right_low"]] / total
w_bq05 <- breast_quad_sizes[["right_up"]] / total

# Weighted sum of the per-quadrant entropies.
info.breast_quad <-
  w_bq01 * entropy_bq01 +
  w_bq02 * entropy_bq02 +
  w_bq03 * entropy_bq03 +
  w_bq04 * entropy_bq04 +
  w_bq05 * entropy_bq05

Computing information gain for breast quadrant

# Information gain from splitting on breast quadrant.
gain.breast_quad <- info.before - info.breast_quad
print(gain.breast_quad)
## [1] 0.1368161

Calculating gain based on irradiation

Looking at the class distribution across the two irradiation subsets

# Class counts for each irradiation value (rows: no/yes, columns: class).
subsets.irradiat <- table(
  breast_cancer[["irradiat"]],
  breast_cancer[["class"]]
)
subsets.irradiat
##      
##       no-recurrence-events recurrence-events
##   no                   164                51
##   yes                   32                30

Computing entropy of irradiation subsets

# Entropy of the class counts for irradiat = no and irradiat = yes
# (type = 2 call on one table row at a time).
entropy_irrno <- entropy.gini(subsets.irradiat["no", ], type = 2)[["entropy"]]
entropy_irryes <- entropy.gini(subsets.irradiat["yes", ], type = 2)[["entropy"]]

Computing weighted entropy for irradiation

# Weight of an irradiation value = its row count divided by the total row count.
irradiat_sizes <- rowSums(subsets.irradiat)
w_irrno <- irradiat_sizes[["no"]] / total
w_irryes <- irradiat_sizes[["yes"]] / total

# Weighted sum of the two entropies.
info.irradiat <-
  w_irrno * entropy_irrno +
  w_irryes * entropy_irryes

Computing information gain for irradiation

# Information gain from splitting on irradiation.
gain.irradiat <- info.before - info.irradiat
print(gain.irradiat)
## [1] 0.1628784

Results: Choosing root node

# Attribute labels, in the same order as the gains combined below.
gain_names <- c(
  "age", "breast", "breast_quad", "deg_malig", "inv_nodes",
  "irradiat", "menopause", "node_caps", "tumor_size"
)

# One information gain per attribute, labelled with the attribute name.
gains <- setNames(
  c(
    gain.age, gain.breast, gain.breast_quad, gain.deg_malig, gain.inv_nodes,
    gain.irradiat, gain.menopause, gain.node_caps, gain.tumor_size
  ),
  gain_names
)

# The attribute with the largest information gain is the one chosen as the
# root node of the decision tree.
best_gain_index <- which.max(gains)
names(gains)[best_gain_index]
## [1] "deg_malig"

1.6.2 Fitting a single tree model using rpart

Decision tree using default settings

# Fit a classification tree for the recurrence outcome with default controls
# and information (entropy) splitting.
# The response is `class`, the same target the information-gain calculations
# earlier in this file split on. The previous formula used deg_malig as the
# response and left class out entirely, so it modelled the degree of
# malignancy instead of recurrence; deg_malig is now an ordinary predictor.
# NOTE(review): the summary output recorded after this call was produced by
# the old deg_malig formula; rerun to refresh it.
set.seed(20)  # fixed seed so the fit is reproducible from run to run
fit <- rpart(
  class ~ age + breast + breast_quad + deg_malig + inv_nodes + irradiat +
    menopause + node_caps + tumor_size,
  data = breast_cancer,
  method = "class",
  control = rpart.control(),
  parms = list(split = "information")
)
summary(fit)
## Call:
## rpart(formula = deg_malig ~ age + breast + breast_quad + inv_nodes + 
##     irradiat + menopause + node_caps + tumor_size, data = breast_cancer, 
##     method = "class", parms = list(split = "information"), control = rpart.control())
##   n= 277 
## 
##           CP nsplit rel error   xerror       xstd
## 1 0.04391892      0 1.0000000 1.000000 0.05609499
## 2 0.02027027      2 0.9121622 1.020270 0.05599795
## 3 0.01351351      3 0.8918919 1.114865 0.05518860
## 4 0.01126126     10 0.7905405 1.168919 0.05445505
## 5 0.01000000     13 0.7567568 1.148649 0.05475386
## 
## Variable importance
##   inv_nodes  tumor_size   node_caps         age   menopause breast_quad 
##          26          23          15          12           9           9 
##      breast    irradiat 
##           5           2 
## 
## Node number 1: 277 observations,    complexity param=0.04391892
##   predicted class=2  expected loss=0.534296  P(node) =1
##     class counts:    66   129    82
##    probabilities: 0.238 0.466 0.296 
##   left son=2 (209 obs) right son=3 (68 obs)
##   Primary splits:
##       inv_nodes  splits as  LRRRRRR,     improve=20.974280, (0 missing)
##       node_caps  splits as  LR,          improve=20.760610, (0 missing)
##       tumor_size splits as  LLRRRRRRRLR, improve= 9.751465, (0 missing)
##       irradiat   splits as  LR,          improve= 9.098196, (0 missing)
##       menopause  splits as  RRL,         improve= 2.881789, (0 missing)
##   Surrogate splits:
##       node_caps splits as  LR, agree=0.892, adj=0.559, (0 split)
##       irradiat  splits as  LR, agree=0.769, adj=0.059, (0 split)
## 
## Node number 2: 209 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5358852  P(node) =0.7545126
##     class counts:    65    97    47
##    probabilities: 0.311 0.464 0.225 
##   left son=4 (73 obs) right son=5 (136 obs)
##   Primary splits:
##       tumor_size  splits as  LLLRRRRRLLL, improve=7.481480, (0 missing)
##       node_caps   splits as  LR,          improve=3.733983, (0 missing)
##       breast_quad splits as  LLRLR,       improve=3.080368, (0 missing)
##       irradiat    splits as  LR,          improve=3.035000, (0 missing)
##       menopause   splits as  RRL,         improve=1.196347, (0 missing)
##   Surrogate splits:
##       breast_quad splits as  LRRRR, agree=0.660, adj=0.027, (0 split)
##       menopause   splits as  RLR,   agree=0.656, adj=0.014, (0 split)
## 
## Node number 3: 68 observations,    complexity param=0.04391892
##   predicted class=3  expected loss=0.4852941  P(node) =0.2454874
##     class counts:     1    32    35
##    probabilities: 0.015 0.471 0.515 
##   left son=6 (36 obs) right son=7 (32 obs)
##   Primary splits:
##       tumor_size  splits as  -RRLRLRRR-L, improve=4.914226, (0 missing)
##       inv_nodes   splits as  -RRRLLL,     improve=3.993382, (0 missing)
##       menopause   splits as  R-L,         improve=2.885781, (0 missing)
##       breast      splits as  RL,          improve=2.874078, (0 missing)
##       breast_quad splits as  LRLRL,       improve=2.874078, (0 missing)
##   Surrogate splits:
##       inv_nodes splits as  -RRLLRR, agree=0.632, adj=0.219, (0 split)
##       irradiat  splits as  LR,      agree=0.603, adj=0.156, (0 split)
##       age       splits as  -LLRR-,  agree=0.559, adj=0.063, (0 split)
##       node_caps splits as  LR,      agree=0.544, adj=0.031, (0 split)
## 
## Node number 4: 73 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.4657534  P(node) =0.2635379
##     class counts:    28    39     6
##    probabilities: 0.384 0.534 0.082 
##   left son=8 (42 obs) right son=9 (31 obs)
##   Primary splits:
##       breast_quad splits as  RLRLL,       improve=2.319334, (0 missing)
##       tumor_size  splits as  RLR-----LLR, improve=2.233882, (0 missing)
##       irradiat    splits as  LR,          improve=1.246302, (0 missing)
##       age         splits as  -LRLLL,      improve=1.116546, (0 missing)
##       menopause   splits as  LRR,         improve=1.096317, (0 missing)
##   Surrogate splits:
##       breast     splits as  LR,          agree=0.630, adj=0.129, (0 split)
##       node_caps  splits as  LR,          agree=0.616, adj=0.097, (0 split)
##       tumor_size splits as  RLL-----LLL, agree=0.603, adj=0.065, (0 split)
##       age        splits as  -LLLRL,      agree=0.589, adj=0.032, (0 split)
##       irradiat   splits as  LR,          agree=0.589, adj=0.032, (0 split)
## 
## Node number 5: 136 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5735294  P(node) =0.4909747
##     class counts:    37    58    41
##    probabilities: 0.272 0.426 0.301 
##   left son=10 (87 obs) right son=11 (49 obs)
##   Primary splits:
##       tumor_size  splits as  ---LLRRL---, improve=2.4450710, (0 missing)
##       irradiat    splits as  LR,          improve=2.4340000, (0 missing)
##       menopause   splits as  RRL,         improve=1.5358650, (0 missing)
##       breast_quad splits as  RRRLR,       improve=1.3153470, (0 missing)
##       age         splits as  LRLRRR,      improve=0.9967619, (0 missing)
##   Surrogate splits:
##       menopause   splits as  LRL,    agree=0.654, adj=0.041, (0 split)
##       node_caps   splits as  LR,     agree=0.654, adj=0.041, (0 split)
##       age         splits as  RLLLLL, agree=0.647, adj=0.020, (0 split)
##       breast_quad splits as  LLLLR,  agree=0.647, adj=0.020, (0 split)
## 
## Node number 6: 36 observations,    complexity param=0.02027027
##   predicted class=2  expected loss=0.3611111  P(node) =0.1299639
##     class counts:     0    23    13
##    probabilities: 0.000 0.639 0.361 
##   left son=12 (21 obs) right son=13 (15 obs)
##   Primary splits:
##       menopause   splits as  R-L,         improve=3.2256310, (0 missing)
##       tumor_size  splits as  ---L-R----L, improve=1.5076650, (0 missing)
##       breast_quad splits as  LRRRL,       improve=1.0070870, (0 missing)
##       breast      splits as  RL,          improve=0.5455044, (0 missing)
##       age         splits as  -LLRR-,      improve=0.3636598, (0 missing)
##   Surrogate splits:
##       age         splits as  -LLRR-,  agree=0.861, adj=0.667, (0 split)
##       breast_quad splits as  LLLRL,   agree=0.667, adj=0.200, (0 split)
##       breast      splits as  RL,      agree=0.639, adj=0.133, (0 split)
##       inv_nodes   splits as  -LLRLLL, agree=0.611, adj=0.067, (0 split)
## 
## Node number 7: 32 observations
##   predicted class=3  expected loss=0.3125  P(node) =0.1155235
##     class counts:     1     9    22
##    probabilities: 0.031 0.281 0.688 
## 
## Node number 8: 42 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.4285714  P(node) =0.1516245
##     class counts:    17    24     1
##    probabilities: 0.405 0.571 0.024 
##   left son=16 (17 obs) right son=17 (25 obs)
##   Primary splits:
##       age         splits as  -LRRLL,      improve=2.3409030, (0 missing)
##       tumor_size  splits as  LLR-----LRR, improve=1.6696220, (0 missing)
##       menopause   splits as  LLR,         improve=0.7583689, (0 missing)
##       breast_quad splits as  -R-LL,       improve=0.5940572, (0 missing)
##       breast      splits as  LR,          improve=0.5723340, (0 missing)
##   Surrogate splits:
##       menopause   splits as  LRR,   agree=0.643, adj=0.118, (0 split)
##       breast_quad splits as  -R-LR, agree=0.619, adj=0.059, (0 split)
## 
## Node number 9: 31 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.516129  P(node) =0.1119134
##     class counts:    11    15     5
##    probabilities: 0.355 0.484 0.161 
##   left son=18 (12 obs) right son=19 (19 obs)
##   Primary splits:
##       tumor_size  splits as  RLR------LR, improve=3.9323710, (0 missing)
##       age         splits as  -RRLRR,      improve=2.3978190, (0 missing)
##       menopause   splits as  LRR,         improve=1.7459430, (0 missing)
##       breast      splits as  LR,          improve=0.3546232, (0 missing)
##       breast_quad splits as  L-R--,       improve=0.2327571, (0 missing)
##   Surrogate splits:
##       age    splits as  -RLRRL, agree=0.677, adj=0.167, (0 split)
##       breast splits as  LR,     agree=0.645, adj=0.083, (0 split)
## 
## Node number 10: 87 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5057471  P(node) =0.3140794
##     class counts:    22    43    22
##    probabilities: 0.253 0.494 0.253 
##   left son=20 (50 obs) right son=21 (37 obs)
##   Primary splits:
##       menopause   splits as  R-L,    improve=4.0142610, (0 missing)
##       breast_quad splits as  LLRLR,  improve=2.5566890, (0 missing)
##       irradiat    splits as  LR,     improve=2.1960260, (0 missing)
##       age         splits as  -LLLRR, improve=1.3840990, (0 missing)
##       breast      splits as  RL,     improve=0.4670241, (0 missing)
##   Surrogate splits:
##       age         splits as  -LLRRR,      agree=0.816, adj=0.568, (0 split)
##       breast_quad splits as  LLRLL,       agree=0.609, adj=0.081, (0 split)
##       tumor_size  splits as  ---RL--L---, agree=0.598, adj=0.054, (0 split)
## 
## Node number 11: 49 observations,    complexity param=0.01351351
##   predicted class=3  expected loss=0.6122449  P(node) =0.1768953
##     class counts:    15    15    19
##    probabilities: 0.306 0.306 0.388 
##   left son=22 (30 obs) right son=23 (19 obs)
##   Primary splits:
##       breast_quad splits as  RRLLL,       improve=1.7055780, (0 missing)
##       irradiat    splits as  LR,          improve=1.2179070, (0 missing)
##       tumor_size  splits as  -----LR----, improve=1.0609000, (0 missing)
##       age         splits as  LRRRL-,      improve=0.7532395, (0 missing)
##       menopause   splits as  LRR,         improve=0.4104658, (0 missing)
##   Surrogate splits:
##       breast splits as  RL, agree=0.653, adj=0.105, (0 split)
## 
## Node number 12: 21 observations
##   predicted class=2  expected loss=0.1904762  P(node) =0.07581227
##     class counts:     0    17     4
##    probabilities: 0.000 0.810 0.190 
## 
## Node number 13: 15 observations
##   predicted class=3  expected loss=0.4  P(node) =0.05415162
##     class counts:     0     6     9
##    probabilities: 0.000 0.400 0.600 
## 
## Node number 16: 17 observations
##   predicted class=1  expected loss=0.4117647  P(node) =0.06137184
##     class counts:    10     7     0
##    probabilities: 0.588 0.412 0.000 
## 
## Node number 17: 25 observations
##   predicted class=2  expected loss=0.32  P(node) =0.09025271
##     class counts:     7    17     1
##    probabilities: 0.280 0.680 0.040 
## 
## Node number 18: 12 observations
##   predicted class=1  expected loss=0.4166667  P(node) =0.0433213
##     class counts:     7     5     0
##    probabilities: 0.583 0.417 0.000 
## 
## Node number 19: 19 observations
##   predicted class=2  expected loss=0.4736842  P(node) =0.06859206
##     class counts:     4    10     5
##    probabilities: 0.211 0.526 0.263 
## 
## Node number 20: 50 observations
##   predicted class=2  expected loss=0.42  P(node) =0.1805054
##     class counts:    14    29     7
##    probabilities: 0.280 0.580 0.140 
## 
## Node number 21: 37 observations,    complexity param=0.01351351
##   predicted class=3  expected loss=0.5945946  P(node) =0.133574
##     class counts:     8    14    15
##    probabilities: 0.216 0.378 0.405 
##   left son=42 (14 obs) right son=43 (23 obs)
##   Primary splits:
##       breast_quad splits as  LLRRR,       improve=1.7855190, (0 missing)
##       breast      splits as  RL,          improve=0.8710870, (0 missing)
##       age         splits as  --RRLL,      improve=0.7952993, (0 missing)
##       tumor_size  splits as  ---RR--L---, improve=0.2003243, (0 missing)
##   Surrogate splits:
##       age splits as  --RRLR, agree=0.649, adj=0.071, (0 split)
## 
## Node number 22: 30 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.6  P(node) =0.1083032
##     class counts:     8    12    10
##    probabilities: 0.267 0.400 0.333 
##   left son=44 (9 obs) right son=45 (21 obs)
##   Primary splits:
##       age         splits as  LLRRL-,      improve=1.6322740, (0 missing)
##       menopause   splits as  LRR,         improve=0.9080533, (0 missing)
##       breast_quad splits as  --LRR,       improve=0.6437860, (0 missing)
##       tumor_size  splits as  -----LR----, improve=0.6418279, (0 missing)
##       breast      splits as  LR,          improve=0.4714349, (0 missing)
## 
## Node number 23: 19 observations
##   predicted class=3  expected loss=0.5263158  P(node) =0.06859206
##     class counts:     7     3     9
##    probabilities: 0.368 0.158 0.474 
## 
## Node number 42: 14 observations
##   predicted class=2  expected loss=0.5  P(node) =0.05054152
##     class counts:     4     7     3
##    probabilities: 0.286 0.500 0.214 
## 
## Node number 43: 23 observations,    complexity param=0.01351351
##   predicted class=3  expected loss=0.4782609  P(node) =0.08303249
##     class counts:     4     7    12
##    probabilities: 0.174 0.304 0.522 
##   left son=86 (11 obs) right son=87 (12 obs)
##   Primary splits:
##       breast     splits as  RL,          improve=2.735390, (0 missing)
##       tumor_size splits as  ---LR--R---, improve=2.650165, (0 missing)
##       age        splits as  --RRLL,      improve=1.319145, (0 missing)
##   Surrogate splits:
##       breast_quad splits as  --LRR,       agree=0.609, adj=0.182, (0 split)
##       age         splits as  --RRRL,      agree=0.565, adj=0.091, (0 split)
##       node_caps   splits as  RL,          agree=0.565, adj=0.091, (0 split)
##       tumor_size  splits as  ---RR--L---, agree=0.565, adj=0.091, (0 split)
## 
## Node number 44: 9 observations
##   predicted class=2  expected loss=0.4444444  P(node) =0.03249097
##     class counts:     3     5     1
##    probabilities: 0.333 0.556 0.111 
## 
## Node number 45: 21 observations
##   predicted class=3  expected loss=0.5714286  P(node) =0.07581227
##     class counts:     5     7     9
##    probabilities: 0.238 0.333 0.429 
## 
## Node number 86: 11 observations
##   predicted class=2  expected loss=0.5454545  P(node) =0.03971119
##     class counts:     3     5     3
##    probabilities: 0.273 0.455 0.273 
## 
## Node number 87: 12 observations
##   predicted class=3  expected loss=0.25  P(node) =0.0433213
##     class counts:     1     2     9
##    probabilities: 0.083 0.167 0.750
# Plot the fitted classification tree. Per the rpart.plot documentation,
# type = 4 labels every node (not only the leaves) and extra = 2 shows the
# number of correct classifications over the number of observations per node.
rpart.plot(
  fit,
  type = 4,
  extra = 2,
  roundint = FALSE
)

Decision tree using custom settings

# Fix the RNG state so the cross-validated columns of the CP table (xerror,
# xstd) are reproducible.
set.seed(20)

# Classification tree for deg_malig with custom settings: a node needs only
# 4 observations to be considered for a split (minsplit = 4) and candidate
# splits are scored with the information criterion.
# NOTE: this assigns to `fit` again, replacing any tree stored there earlier.
fit <- rpart(
  formula = deg_malig ~ age + breast + breast_quad + inv_nodes + irradiat +
    menopause + node_caps + tumor_size,
  data = breast_cancer,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(minsplit = 4, cp = 0.01)
)

# Print the CP table, variable importance and the per-node split details.
summary(fit)
## Call:
## rpart(formula = deg_malig ~ age + breast + breast_quad + inv_nodes + 
##     irradiat + menopause + node_caps + tumor_size, data = breast_cancer, 
##     method = "class", parms = list(split = "information"), control = rpart.control(minsplit = 4, 
##         cp = 0.01))
##   n= 277 
## 
##           CP nsplit rel error   xerror       xstd
## 1 0.02702703      0 1.0000000 1.000000 0.05609499
## 2 0.01351351      4 0.8918919 1.013514 0.05603326
## 3 0.01216216      9 0.8243243 1.074324 0.05560809
## 4 0.01126126     14 0.7635135 1.121622 0.05510791
## 5 0.01013514     18 0.7162162 1.121622 0.05510791
## 6 0.01000000     20 0.6959459 1.121622 0.05510791
## 
## Variable importance
##  tumor_size   inv_nodes breast_quad   node_caps         age      breast 
##          27          21          13          11          10           8 
##    irradiat   menopause 
##           5           5 
## 
## Node number 1: 277 observations,    complexity param=0.02702703
##   predicted class=2  expected loss=0.534296  P(node) =1
##     class counts:    66   129    82
##    probabilities: 0.238 0.466 0.296 
##   left son=2 (209 obs) right son=3 (68 obs)
##   Primary splits:
##       inv_nodes  splits as  LRRRRRR,     improve=20.974280, (0 missing)
##       node_caps  splits as  LR,          improve=20.760610, (0 missing)
##       tumor_size splits as  LLRRRRRRRLR, improve= 9.751465, (0 missing)
##       irradiat   splits as  LR,          improve= 9.098196, (0 missing)
##       menopause  splits as  RRL,         improve= 2.881789, (0 missing)
##   Surrogate splits:
##       node_caps splits as  LR, agree=0.892, adj=0.559, (0 split)
##       irradiat  splits as  LR, agree=0.769, adj=0.059, (0 split)
## 
## Node number 2: 209 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5358852  P(node) =0.7545126
##     class counts:    65    97    47
##    probabilities: 0.311 0.464 0.225 
##   left son=4 (73 obs) right son=5 (136 obs)
##   Primary splits:
##       tumor_size  splits as  LLLRRRRRLLL, improve=7.481480, (0 missing)
##       node_caps   splits as  LR,          improve=3.733983, (0 missing)
##       breast_quad splits as  LLRLR,       improve=3.080368, (0 missing)
##       irradiat    splits as  LR,          improve=3.035000, (0 missing)
##       menopause   splits as  RRL,         improve=1.196347, (0 missing)
##   Surrogate splits:
##       breast_quad splits as  LRRRR, agree=0.660, adj=0.027, (0 split)
##       menopause   splits as  RLR,   agree=0.656, adj=0.014, (0 split)
## 
## Node number 3: 68 observations,    complexity param=0.02702703
##   predicted class=3  expected loss=0.4852941  P(node) =0.2454874
##     class counts:     1    32    35
##    probabilities: 0.015 0.471 0.515 
##   left son=6 (62 obs) right son=7 (6 obs)
##   Primary splits:
##       tumor_size  splits as  -RRLLLLLR-L, improve=5.939564, (0 missing)
##       inv_nodes   splits as  -RRRLLL,     improve=3.993382, (0 missing)
##       menopause   splits as  R-L,         improve=2.885781, (0 missing)
##       breast      splits as  RL,          improve=2.874078, (0 missing)
##       breast_quad splits as  LRLRL,       improve=2.874078, (0 missing)
## 
## Node number 4: 73 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.4657534  P(node) =0.2635379
##     class counts:    28    39     6
##    probabilities: 0.384 0.534 0.082 
##   left son=8 (42 obs) right son=9 (31 obs)
##   Primary splits:
##       breast_quad splits as  RLRLL,       improve=2.319334, (0 missing)
##       tumor_size  splits as  RLR-----LLR, improve=2.233882, (0 missing)
##       node_caps   splits as  LR,          improve=1.921021, (0 missing)
##       irradiat    splits as  LR,          improve=1.246302, (0 missing)
##       age         splits as  -LRLLL,      improve=1.116546, (0 missing)
##   Surrogate splits:
##       breast     splits as  LR,          agree=0.630, adj=0.129, (0 split)
##       node_caps  splits as  LR,          agree=0.616, adj=0.097, (0 split)
##       tumor_size splits as  RLL-----LLL, agree=0.603, adj=0.065, (0 split)
##       age        splits as  -LLLRL,      agree=0.589, adj=0.032, (0 split)
##       irradiat   splits as  LR,          agree=0.589, adj=0.032, (0 split)
## 
## Node number 5: 136 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5735294  P(node) =0.4909747
##     class counts:    37    58    41
##    probabilities: 0.272 0.426 0.301 
##   left son=10 (87 obs) right son=11 (49 obs)
##   Primary splits:
##       tumor_size splits as  ---LLRRL---, improve=2.445071, (0 missing)
##       irradiat   splits as  LR,          improve=2.434000, (0 missing)
##       node_caps  splits as  LR,          improve=2.052153, (0 missing)
##       age        splits as  RRRRRL,      improve=1.927031, (0 missing)
##       menopause  splits as  RRL,         improve=1.535865, (0 missing)
##   Surrogate splits:
##       menopause   splits as  LRL,    agree=0.654, adj=0.041, (0 split)
##       node_caps   splits as  LR,     agree=0.654, adj=0.041, (0 split)
##       age         splits as  RLLLLL, agree=0.647, adj=0.020, (0 split)
##       breast_quad splits as  LLLLR,  agree=0.647, adj=0.020, (0 split)
## 
## Node number 6: 62 observations,    complexity param=0.02702703
##   predicted class=2  expected loss=0.483871  P(node) =0.2238267
##     class counts:     0    32    30
##    probabilities: 0.000 0.516 0.484 
##   left son=12 (15 obs) right son=13 (47 obs)
##   Primary splits:
##       tumor_size  splits as  ---LRRRR--L, improve=3.382129, (0 missing)
##       inv_nodes   splits as  -RRRLLL,     improve=3.093532, (0 missing)
##       menopause   splits as  R-L,         improve=2.619234, (0 missing)
##       breast_quad splits as  LRLRL,       improve=2.090369, (0 missing)
##       breast      splits as  RL,          improve=1.582488, (0 missing)
##   Surrogate splits:
##       inv_nodes splits as  -RRLRRR, agree=0.774, adj=0.067, (0 split)
## 
## Node number 7: 6 observations
##   predicted class=3  expected loss=0.1666667  P(node) =0.02166065
##     class counts:     1     0     5
##    probabilities: 0.167 0.000 0.833 
## 
## Node number 8: 42 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.4285714  P(node) =0.1516245
##     class counts:    17    24     1
##    probabilities: 0.405 0.571 0.024 
##   left son=16 (17 obs) right son=17 (25 obs)
##   Primary splits:
##       age         splits as  -LRRLL,      improve=2.3409030, (0 missing)
##       tumor_size  splits as  LLR-----LRR, improve=1.6696220, (0 missing)
##       menopause   splits as  LLR,         improve=0.7583689, (0 missing)
##       breast_quad splits as  -R-LL,       improve=0.5940572, (0 missing)
##       breast      splits as  LR,          improve=0.5723340, (0 missing)
##   Surrogate splits:
##       menopause   splits as  LRR,   agree=0.643, adj=0.118, (0 split)
##       breast_quad splits as  -R-LR, agree=0.619, adj=0.059, (0 split)
## 
## Node number 9: 31 observations,    complexity param=0.01126126
##   predicted class=2  expected loss=0.516129  P(node) =0.1119134
##     class counts:    11    15     5
##    probabilities: 0.355 0.484 0.161 
##   left son=18 (12 obs) right son=19 (19 obs)
##   Primary splits:
##       tumor_size splits as  RLR------LR, improve=3.932371, (0 missing)
##       irradiat   splits as  LR,          improve=3.181426, (0 missing)
##       age        splits as  -RRLRR,      improve=2.397819, (0 missing)
##       menopause  splits as  LRL,         improve=1.915670, (0 missing)
##       node_caps  splits as  LR,          improve=1.463911, (0 missing)
##   Surrogate splits:
##       age    splits as  -RLRRL, agree=0.677, adj=0.167, (0 split)
##       breast splits as  LR,     agree=0.645, adj=0.083, (0 split)
## 
## Node number 10: 87 observations,    complexity param=0.01216216
##   predicted class=2  expected loss=0.5057471  P(node) =0.3140794
##     class counts:    22    43    22
##    probabilities: 0.253 0.494 0.253 
##   left son=20 (50 obs) right son=21 (37 obs)
##   Primary splits:
##       menopause   splits as  R-L,    improve=4.014261, (0 missing)
##       breast_quad splits as  LLRLR,  improve=2.556689, (0 missing)
##       age         splits as  -RRRRL, improve=2.279601, (0 missing)
##       irradiat    splits as  LR,     improve=2.196026, (0 missing)
##       node_caps   splits as  RL,     improve=1.433499, (0 missing)
##   Surrogate splits:
##       age         splits as  -LLRRR,      agree=0.816, adj=0.568, (0 split)
##       breast_quad splits as  LLRLL,       agree=0.609, adj=0.081, (0 split)
##       tumor_size  splits as  ---RL--L---, agree=0.598, adj=0.054, (0 split)
## 
## Node number 11: 49 observations,    complexity param=0.01351351
##   predicted class=3  expected loss=0.6122449  P(node) =0.1768953
##     class counts:    15    15    19
##    probabilities: 0.306 0.306 0.388 
##   left son=22 (2 obs) right son=23 (47 obs)
##   Primary splits:
##       breast_quad splits as  RRRLR,       improve=2.465846, (0 missing)
##       node_caps   splits as  LR,          improve=1.893171, (0 missing)
##       irradiat    splits as  LR,          improve=1.217907, (0 missing)
##       age         splits as  LRRRR-,      improve=1.207596, (0 missing)
##       tumor_size  splits as  -----LR----, improve=1.060900, (0 missing)
## 
## Node number 12: 15 observations
##   predicted class=2  expected loss=0.2  P(node) =0.05415162
##     class counts:     0    12     3
##    probabilities: 0.000 0.800 0.200 
## 
## Node number 13: 47 observations,    complexity param=0.02702703
##   predicted class=3  expected loss=0.4255319  P(node) =0.1696751
##     class counts:     0    20    27
##    probabilities: 0.000 0.426 0.574 
##   left son=26 (24 obs) right son=27 (23 obs)
##   Primary splits:
##       breast_quad splits as  LRLRL,       improve=2.5528750, (0 missing)
##       inv_nodes   splits as  -RR-LLL,     improve=1.5080160, (0 missing)
##       breast      splits as  RL,          improve=1.2247570, (0 missing)
##       age         splits as  -RLRL-,      improve=0.7515622, (0 missing)
##       tumor_size  splits as  ----RLRR---, improve=0.7515622, (0 missing)
##   Surrogate splits:
##       breast    splits as  RL,      agree=0.872, adj=0.739, (0 split)
##       age       splits as  -LLRL-,  agree=0.681, adj=0.348, (0 split)
##       menopause splits as  R-L,     agree=0.660, adj=0.304, (0 split)
##       node_caps splits as  LR,      agree=0.596, adj=0.174, (0 split)
##       inv_nodes splits as  -LL-LRL, agree=0.574, adj=0.130, (0 split)
## 
## Node number 16: 17 observations,    complexity param=0.01126126
##   predicted class=1  expected loss=0.4117647  P(node) =0.06137184
##     class counts:    10     7     0
##    probabilities: 0.588 0.412 0.000 
##   left son=32 (15 obs) right son=33 (2 obs)
##   Primary splits:
##       tumor_size  splits as  LLL-----LRR, improve=1.9696920, (0 missing)
##       breast_quad splits as  -R-RL,       improve=1.1535550, (0 missing)
##       breast      splits as  RL,          improve=0.6976266, (0 missing)
##       age         splits as  -R--RL,      improve=0.5523776, (0 missing)
##       irradiat    splits as  RL,          improve=0.5523776, (0 missing)
## 
## Node number 17: 25 observations
##   predicted class=2  expected loss=0.32  P(node) =0.09025271
##     class counts:     7    17     1
##    probabilities: 0.280 0.680 0.040 
## 
## Node number 18: 12 observations
##   predicted class=1  expected loss=0.4166667  P(node) =0.0433213
##     class counts:     7     5     0
##    probabilities: 0.583 0.417 0.000 
## 
## Node number 19: 19 observations
##   predicted class=2  expected loss=0.4736842  P(node) =0.06859206
##     class counts:     4    10     5
##    probabilities: 0.211 0.526 0.263 
## 
## Node number 20: 50 observations,    complexity param=0.01013514
##   predicted class=2  expected loss=0.42  P(node) =0.1805054
##     class counts:    14    29     7
##    probabilities: 0.280 0.580 0.140 
##   left son=40 (42 obs) right son=41 (8 obs)
##   Primary splits:
##       irradiat    splits as  LR,          improve=3.9554440, (0 missing)
##       breast_quad splits as  RRRLR,       improve=1.4269000, (0 missing)
##       tumor_size  splits as  ---RR--L---, improve=1.3395510, (0 missing)
##       node_caps   splits as  LR,          improve=0.5521029, (0 missing)
##       age         splits as  -RLL--,      improve=0.3090612, (0 missing)
## 
## Node number 21: 37 observations,    complexity param=0.01216216
##   predicted class=3  expected loss=0.5945946  P(node) =0.133574
##     class counts:     8    14    15
##    probabilities: 0.216 0.378 0.405 
##   left son=42 (32 obs) right son=43 (5 obs)
##   Primary splits:
##       breast_quad splits as  LLLRR,  improve=2.6485070, (0 missing)
##       age         splits as  --RRRL, improve=2.2392370, (0 missing)
##       node_caps   splits as  RL,     improve=0.9948200, (0 missing)
##       irradiat    splits as  LR,     improve=0.9233306, (0 missing)
##       breast      splits as  RL,     improve=0.8710870, (0 missing)
##   Surrogate splits:
##       irradiat splits as  LR, agree=0.892, adj=0.2, (0 split)
## 
## Node number 22: 2 observations
##   predicted class=2  expected loss=0  P(node) =0.007220217
##     class counts:     0     2     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 23: 47 observations
##   predicted class=3  expected loss=0.5957447  P(node) =0.1696751
##     class counts:    15    13    19
##    probabilities: 0.319 0.277 0.404 
## 
## Node number 26: 24 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.4166667  P(node) =0.0866426
##     class counts:     0    14    10
##    probabilities: 0.000 0.583 0.417 
##   left son=52 (4 obs) right son=53 (20 obs)
##   Primary splits:
##       age         splits as  -RRRL-,  improve=2.4376950, (0 missing)
##       inv_nodes   splits as  -RR-RLL, improve=1.1694330, (0 missing)
##       breast_quad splits as  L-R-L,   improve=0.7116023, (0 missing)
##       irradiat    splits as  RL,      improve=0.3447022, (0 missing)
##       breast      splits as  LR,      improve=0.2885215, (0 missing)
##   Surrogate splits:
##       breast_quad splits as  L-R-R, agree=0.875, adj=0.25, (0 split)
## 
## Node number 27: 23 observations
##   predicted class=3  expected loss=0.2608696  P(node) =0.08303249
##     class counts:     0     6    17
##    probabilities: 0.000 0.261 0.739 
## 
## Node number 32: 15 observations
##   predicted class=1  expected loss=0.3333333  P(node) =0.05415162
##     class counts:    10     5     0
##    probabilities: 0.667 0.333 0.000 
## 
## Node number 33: 2 observations
##   predicted class=2  expected loss=0  P(node) =0.007220217
##     class counts:     0     2     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 40: 42 observations
##   predicted class=2  expected loss=0.3809524  P(node) =0.1516245
##     class counts:    13    26     3
##    probabilities: 0.310 0.619 0.071 
## 
## Node number 41: 8 observations,    complexity param=0.01013514
##   predicted class=3  expected loss=0.5  P(node) =0.02888087
##     class counts:     1     3     4
##    probabilities: 0.125 0.375 0.500 
##   left son=82 (4 obs) right son=83 (4 obs)
##   Primary splits:
##       breast      splits as  LR,          improve=3.2958370, (0 missing)
##       tumor_size  splits as  ---RL--L---, improve=1.7260920, (0 missing)
##       age         splits as  -LLR--,      improve=0.7648207, (0 missing)
##       breast_quad splits as  -LR--,       improve=0.6103747, (0 missing)
##   Surrogate splits:
##       age         splits as  -LRR--,      agree=0.750, adj=0.50, (0 split)
##       breast_quad splits as  -RL--,       agree=0.625, adj=0.25, (0 split)
##       tumor_size  splits as  ---RR--L---, agree=0.625, adj=0.25, (0 split)
## 
## Node number 42: 32 observations,    complexity param=0.01216216
##   predicted class=2  expected loss=0.5625  P(node) =0.1155235
##     class counts:     6    14    12
##    probabilities: 0.188 0.438 0.375 
##   left son=84 (14 obs) right son=85 (18 obs)
##   Primary splits:
##       breast_quad splits as  LLR--,       improve=1.6588870, (0 missing)
##       age         splits as  --RRRL,      improve=1.3359340, (0 missing)
##       node_caps   splits as  RL,          improve=0.8474846, (0 missing)
##       tumor_size  splits as  ---LR--L---, improve=0.4361799, (0 missing)
##       breast      splits as  RL,          improve=0.3107940, (0 missing)
##   Surrogate splits:
##       tumor_size splits as  ---RL--R---, agree=0.656, adj=0.214, (0 split)
##       age        splits as  --RRLR,      agree=0.625, adj=0.143, (0 split)
## 
## Node number 43: 5 observations
##   predicted class=3  expected loss=0.4  P(node) =0.01805054
##     class counts:     2     0     3
##    probabilities: 0.400 0.000 0.600 
## 
## Node number 52: 4 observations
##   predicted class=2  expected loss=0  P(node) =0.01444043
##     class counts:     0     4     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 53: 20 observations,    complexity param=0.01351351
##   predicted class=2  expected loss=0.5  P(node) =0.07220217
##     class counts:     0    10    10
##    probabilities: 0.000 0.500 0.500 
##   left son=106 (8 obs) right son=107 (12 obs)
##   Primary splits:
##       inv_nodes  splits as  -RR-RLL,     improve=1.7260920, (0 missing)
##       age        splits as  -LLR--,      improve=1.2657560, (0 missing)
##       menopause  splits as  R-L,         improve=0.6485757, (0 missing)
##       tumor_size splits as  ----RLLR---, improve=0.4201185, (0 missing)
##       irradiat   splits as  RL,          improve=0.4027103, (0 missing)
##   Surrogate splits:
##       age        splits as  -LRR--,      agree=0.65, adj=0.125, (0 split)
##       tumor_size splits as  ----RRLR---, agree=0.65, adj=0.125, (0 split)
## 
## Node number 82: 4 observations
##   predicted class=2  expected loss=0.25  P(node) =0.01444043
##     class counts:     0     3     1
##    probabilities: 0.000 0.750 0.250 
## 
## Node number 83: 4 observations
##   predicted class=3  expected loss=0.25  P(node) =0.01444043
##     class counts:     1     0     3
##    probabilities: 0.250 0.000 0.750 
## 
## Node number 84: 14 observations,    complexity param=0.01216216
##   predicted class=2  expected loss=0.5  P(node) =0.05054152
##     class counts:     4     7     3
##    probabilities: 0.286 0.500 0.214 
##   left son=168 (6 obs) right son=169 (8 obs)
##   Primary splits:
##       tumor_size  splits as  ---LR--R---, improve=2.5310160, (0 missing)
##       age         splits as  --RLR-,      improve=1.5716460, (0 missing)
##       breast_quad splits as  RL---,       improve=0.7316467, (0 missing)
##       breast      splits as  LR,          improve=0.6214736, (0 missing)
##   Surrogate splits:
##       breast_quad splits as  LR---, agree=0.643, adj=0.167, (0 split)
## 
## Node number 85: 18 observations,    complexity param=0.01216216
##   predicted class=3  expected loss=0.5  P(node) =0.06498195
##     class counts:     2     7     9
##    probabilities: 0.111 0.389 0.500 
##   left son=170 (10 obs) right son=171 (8 obs)
##   Primary splits:
##       breast     splits as  RL,          improve=2.4487940, (0 missing)
##       age        splits as  --RRLL,      improve=2.1119580, (0 missing)
##       tumor_size splits as  ---RR--L---, improve=1.9564520, (0 missing)
##       node_caps  splits as  RL,          improve=0.9912506, (0 missing)
## 
## Node number 106: 8 observations
##   predicted class=2  expected loss=0.25  P(node) =0.02888087
##     class counts:     0     6     2
##    probabilities: 0.000 0.750 0.250 
## 
## Node number 107: 12 observations
##   predicted class=3  expected loss=0.3333333  P(node) =0.0433213
##     class counts:     0     4     8
##    probabilities: 0.000 0.333 0.667 
## 
## Node number 168: 6 observations
##   predicted class=1  expected loss=0.5  P(node) =0.02166065
##     class counts:     3     1     2
##    probabilities: 0.500 0.167 0.333 
## 
## Node number 169: 8 observations
##   predicted class=2  expected loss=0.25  P(node) =0.02888087
##     class counts:     1     6     1
##    probabilities: 0.125 0.750 0.125 
## 
## Node number 170: 10 observations
##   predicted class=2  expected loss=0.5  P(node) =0.03610108
##     class counts:     2     5     3
##    probabilities: 0.200 0.500 0.300 
## 
## Node number 171: 8 observations
##   predicted class=3  expected loss=0.25  P(node) =0.02888087
##     class counts:     0     2     6
##    probabilities: 0.000 0.250 0.750
# Plot the custom-settings tree with the same display options as before:
# all nodes labelled (type = 4) and correct / total counts per node
# (extra = 2), as described in the rpart.plot documentation.
rpart.plot(
  fit,
  type = 4,
  extra = 2,
  roundint = FALSE
)

Comparison of decision trees

# The decision tree created with default settings has a height of 7, while the decision tree created with custom settings has a height of 8. Because it is shallower, the default tree will make predictions faster and is less likely to overfit our sample data. However, further tuning would likely produce a tree that is even less prone to overfitting and faster at prediction.

1.6.3 Building a boosted tree with holdout validation

Cleaning data for use in boosted tree generation

# The boosted model needs a numeric 0/1 response, so derive one from `class`:
# 1 when the record is a "recurrence-events" case, 0 otherwise.
breast_cancer <- mutate(
  breast_cancer,
  recur_label = as.numeric(class == "recurrence-events")
)

Splitting data into two sets for training and testing (holdout validation)

# Holdout validation: shuffle the rows reproducibly, then use the first 66%
# of the shuffled data for training and the remaining rows for testing.
set.seed(52)

# Random permutation of the row indices.
rows <- sample(nrow(breast_cancer))

# Reorder the dataset according to the shuffled indices.
breast_cancer <- breast_cancer[rows, ]

# Number of rows assigned to the training set.
split <- round(nrow(breast_cancer) * 0.66)

# Partition with a logical mask. The previous `split+1:nrow(breast_cancer)`
# was parsed as `split + (1:nrow(breast_cancer))` because `:` binds tighter
# than `+`, so the test indices ran past the last row of the data instead of
# stopping at it. The mask also behaves sensibly when either part is empty.
in_train <- seq_len(nrow(breast_cancer)) <= split
train <- breast_cancer[in_train, ]
test <- breast_cancer[!in_train, ]

Creating boosted tree

# Model formula: the 0/1 recurrence label against every clinical predictor.
trainFormula <- recur_label ~ deg_malig + age + breast + breast_quad +
  inv_nodes + irradiat + menopause + node_caps + tumor_size

# Predictor matrix and response vector built from the training rows.
trainX <- build.x(trainFormula, train, contrasts = FALSE)
trainY <- build.y(trainFormula, train)

# Boosted classification trees: 10 rounds, learning rate 0.3, depth limit 3.
# The thread count is left at the library default (nthread = 4 was tried and
# disabled).
trainBoost <- xgboost::xgboost(
  data = trainX,
  label = trainY,
  objective = "binary:logistic",
  nrounds = 10,
  eta = 0.3,
  max.depth = 3
)
## [1]  train-error:0.224044 
## [2]  train-error:0.213115 
## [3]  train-error:0.218579 
## [4]  train-error:0.234973 
## [5]  train-error:0.218579 
## [6]  train-error:0.196721 
## [7]  train-error:0.202186 
## [8]  train-error:0.191257 
## [9]  train-error:0.191257 
## [10] train-error:0.196721
# Summarise all trees of the boosted model in a single diagram.
# The feature-name argument is spelled `feature_names` (the same spelling the
# xgb.importance() call for this model uses); the earlier `feature.names`
# spelling does not match that documented argument name.
xgb.plot.multi.trees(
  model = trainBoost,
  feature_names = colnames(trainX)
)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.

Single tree plots

# Draw individual trees from the boosted model. Tree indices 2 and 3 are
# requested; NOTE(review): xgboost documents tree indices as zero-based, so
# these would be the third and fourth trees — confirm that is intended.
xgb.plot.tree(
  model = trainBoost,
  trees = 2:3,
  render = TRUE
)

1.6.4 Ranking variables in terms of their importance to the boosted tree

Using xgb

# Feature-importance table (Gain, Cover, Frequency) for the boosted model,
# labelled with the column names of the training matrix.
importance_matrix <- xgb.importance(
  feature_names = colnames(trainX),
  model = trainBoost
)

# Show the full ranking.
print(importance_matrix)
##                  Feature        Gain      Cover  Frequency
##  1:            deg_malig 0.301240180 0.18227566 0.10909091
##  2:         inv_nodes0-2 0.150060532 0.12733374 0.10909091
##  3:      tumor_size10-14 0.064638853 0.12540951 0.07272727
##  4:  breast_quadleft_low 0.059999214 0.02371011 0.10909091
##  5:          node_capsno 0.049320091 0.02367454 0.03636364
##  6:             age30-39 0.046434893 0.07822831 0.05454545
##  7:      tumor_size30-34 0.044341436 0.03115341 0.07272727
##  8:           irradiatno 0.038286389 0.03753707 0.05454545
##  9:  breast_quadright_up 0.036663111 0.05114478 0.03636364
## 10:           breastleft 0.035637076 0.03344851 0.07272727
## 11:     menopausepremeno 0.034758201 0.05609384 0.05454545
## 12:      tumor_size20-24 0.030920365 0.05056489 0.05454545
## 13:             age60-69 0.029433021 0.01526813 0.03636364
## 14: breast_quadright_low 0.025520765 0.03538208 0.03636364
## 15:      tumor_size40-44 0.017492958 0.04838156 0.03636364
## 16:      tumor_size35-39 0.016262771 0.02807589 0.01818182
## 17:             age50-59 0.011874616 0.02942488 0.01818182
## 18:      tumor_size50-54 0.007115528 0.02289308 0.01818182
# Bar chart of the importance table computed above.
xgb.plot.importance(
  importance_matrix
)

Using vip

# Variable-importance plot via the vip package; num_features = 18 asks for
# up to 18 features, the number of rows in the xgb importance table.
vip::vip(
  trainBoost,
  num_features = 18
)

1.6.5 Constructing random forest model

# Response vector for the training rows, rebuilt from the same formula and
# training data used for the boosted tree.
trainY <- build.y(trainFormula, train)

# Forest-style ensemble fitted with xgboost: each of the 3 rounds grows 1000
# parallel trees (depth limit 4), with subsample and colsample_bytree both
# set to 0.5.
boostedForest <- xgboost(
  data = trainX,
  label = trainY,
  objective = "binary:logistic",
  nrounds = 3,
  num_parallel_tree = 1000,
  max_depth = 4,
  subsample = 0.5,
  colsample_bytree = 0.5
)
## [1]  train-error:0.224044 
## [2]  train-error:0.224044 
## [3]  train-error:0.229508
# Summarise all trees of the forest model in a single diagram.
# The feature-name argument is spelled `feature_names` (the same spelling the
# xgb.importance() call in this file uses); the earlier `feature.names`
# spelling does not match that documented argument name.
xgb.plot.multi.trees(
  model = boostedForest,
  feature_names = colnames(trainX)
)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.